Introduction

This analysis aims to explore crime data from Los Angeles using the dataset provided by Data.gov, covering the period starting from 2020. The dataset can be accessed here.

Notes from dataset description:
- This data is transcribed from original crime reports that are typed on paper and therefore there may be some inaccuracies within the data.
- Some location fields with missing data are noted as (0°, 0°).
- Address fields are only provided to the nearest hundred block in order to maintain privacy.

Loading Dataset

library(tidyverse)
library(dplyr)
library(ggplot2)
library(leaflet)
library(sp)
library(KernSmooth)
library(raster)
library(e1071)
library(data.table)
library(ggcorrplot)
library(plotly)

Sys.setlocale("LC_TIME", "English")
## [1] "English_United States.1252"
data_main <- read_csv('C:/Users/Przemysław/Desktop/msad/Crime_Data_from_2020_to_Present.csv',show_col_types = FALSE)

Initial dataset description

head(data_main)

{r}{r echo=T, results='hide'} str(data_main)

{r}{r echo=T, results='hide'} summary(data_main)

Missing values investigation

apply(data_main, 2, function(x) any(is.na(x)))
##          DR_NO      Date Rptd       DATE OCC       TIME OCC           AREA 
##          FALSE          FALSE          FALSE          FALSE          FALSE 
##      AREA NAME    Rpt Dist No       Part 1-2         Crm Cd    Crm Cd Desc 
##          FALSE          FALSE          FALSE          FALSE          FALSE 
##        Mocodes       Vict Age       Vict Sex   Vict Descent      Premis Cd 
##           TRUE          FALSE           TRUE           TRUE           TRUE 
##    Premis Desc Weapon Used Cd    Weapon Desc         Status    Status Desc 
##           TRUE           TRUE           TRUE          FALSE          FALSE 
##       Crm Cd 1       Crm Cd 2       Crm Cd 3       Crm Cd 4       LOCATION 
##           TRUE           TRUE           TRUE           TRUE          FALSE 
##   Cross Street            LAT            LON 
##           TRUE          FALSE          FALSE

There are 10 columns containing N/A values. Moreover, based on the dataset description, the Location columns are converted to (0°, 0°) for entries with missing values. However, there are no zeroed LAT or LNG values in the actual dataset.

sum(data_main$LAT == 0 | data_main$LNG == 0)
## [1] 0

The percentage impact of the missing values.

missing_percentage <- colMeans(is.na(data_main)) * 100
missing_df <- data.frame(Column = names(missing_percentage), Percent_Missing = missing_percentage)

ggplot(missing_df, aes(x = Column, y = Percent_Missing)) +
  geom_bar(stat = "identity", fill = "skyblue", width = 0.5) +
  labs(title = "Percentage of Missing Data per Column", x = "Column", y = "Percent Missing") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Based on the missing values percentage, we can drop Crm Cd 3 and Crm Cd 4 as those columns do not contain other values than null.

Values for Mocodes , Vict Descent and Vict Sex could be MNAR (Missing Not At Random) values that are logically associated with each other. The same for Weapon Desc and Weapon Used Cd .

missing_rows_mocodes <- which(is.na(data_main$Mocodes))
missing_rows_descent <- which(is.na(data_main$'Vict Descent'))
missing_rows_sex <- which(is.na(data_main$'Vict Sex'))

#Check if missing values for Vict Sex, Vict Descent and Mocodes are for the same rows. 
(identical(missing_rows_mocodes, missing_rows_descent) 
&& identical(missing_rows_mocodes, missing_rows_sex)
  && identical(missing_rows_descent, missing_rows_sex))
## [1] FALSE

There is no exact matching on Mocodes , Vict Descent and Vict Sex . However Vict Descent and Vict Sex are matching in 99.99% of cases.

length(intersect(missing_rows_descent, missing_rows_sex)) / length(missing_rows_descent) * 100
## [1] 99.98901

Mocodes and Vict Sex are matching in 94.85%.

length(intersect(missing_rows_mocodes, missing_rows_sex)) / length(missing_rows_mocodes) * 100
## [1] 94.85076

We can observe that Vict Sex contains hidden missing values represented as 0.

head(data_main$'Vict Age'[is.na(data_main$'Vict Descent')], 100)
##   [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##  [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

The occurrence of missing values in Vict Sex and zero values in Vict Age match 99.97% of the time.

 length(intersect(missing_rows_sex, which(data_main$'Vict Age' == 0))) / length(missing_rows_sex) * 100
## [1] 99.97295

Weapon Desc and Weapon Used Cd have exactly matching missing values.

missing_rows_weapon <- which(is.na(data_main$'Weapon Used Cd'))
missing_rows_weapon_desc <- which(is.na(data_main$'Weapon Desc'))

identical(missing_rows_weapon, missing_rows_weapon_desc)
## [1] TRUE

Summary
- Columns Crm Cd 2, Crm Cd 3, Crm Cd 4, and Cross Street contain more than 80% missing values and do not provide valuable information for analysis. These columns can be dropped in the next steps.
- Columns Weapon Desc and Weapon Used Cd have matching occurrences of missing values and are MNAR (Missing Not At Random). This suggests that these columns have values only if the crime involved the use of a weapon.
- Columns Mocodes, Vict Descent, Vict Sex, and Vict Age (with 0 as a missing value) have most occurrences matching, but not exactly. These variables are logically associated with each other; data is missing if there is an inability to identify a victim. So, those values are mostly MNAR but there is posibility of MAR (Missing at random).

Basic dataset processing

# Removing unnecessary columns  
data_main <- data_main[, !names(data_main) %in% c("DR_NO", 
                                                  "Part 1-2",
                                                  "AREA", #the same info as AREA NAME
                                                  "Crm Cd", 
                                                  "Premis Cd",
                                                  "Status", #the same info as Status Desc
                                                  "Mocodes",
                                                  "Crm Cd 1", 
                                                  "Crm Cd 2", 
                                                  "Crm Cd 2",
                                                  "Crm Cd 3", 
                                                  "Crm Cd 4", 
                                                  "Cross Street")]

Merging together DATE OCC (format MM/DD/YYYY) and TIME OCC (format HHMM),

hour <- as.numeric(substr(data_main$'TIME OCC', 1, 2))
minute <- as.numeric(substr(data_main$'TIME OCC', 3, 4))
date <- as.Date(substr(data_main$'DATE OCC', 1, 10) , format = "%m/%d/%Y")
                   
data_main$datetime_occ <- as.POSIXct(paste(date, hour, minute, sep = " "), format = "%Y-%m-%d %H %M")
data_main = data_main[, !names(data_main) %in% c("DATE OCC", "TIME OCC")]

Mapping Vict Descent and Vict Sex . Mapping from LAPD OpenData City of LA.

victim_descent_mapping = c("A" = "Other Asian",
                           "B" = "Black",
                           "C" = "Chinese",
                           "D" = "Cambodian",
                           "F" = "Filipino",
                           "G" = "Guamanian",
                           "H" = "Hispanic/Latin/Mexican",
                           "I" = "Native",
                           "J" = "Japanese",
                           "K" = "Korean",
                           "L" = "Laotian",
                           "O" = "Other",
                           "P" = "Pacific Islander",
                           "S" = "Samoan",
                           "U" = "Hawaiian",
                           "V" = "Vietnamese",
                           "W" = "White",
                           "X" = "Unknown",
                           "Z" = "Asian Indian")

data_main$'Vict Descent' <- factor(data_main$'Vict Descent', levels = names(victim_descent_mapping), labels = victim_descent_mapping)

victim_sex_mapping = c("F" = "Female",
                           "M" = "Male",
                           "H" = "Other",
                           "X" = "Unknown")

data_main$'Vict Sex' <- factor(data_main$'Vict Sex', levels = names(victim_sex_mapping), labels = victim_sex_mapping)

Changing 0 in Vict Age into N/A and converting to date in Date Rptd.

data_main$'Vict Age'[data_main$'Vict Age' <= 0] <- NA
data_main$'Date Rptd' <- as.Date(data_main$'Date Rptd', format = "%m/%d/%Y")

New columns dow_occ day of week of the crime, hod_occ hour of day, week and year of the crime.

data_main$dow_occ <- weekdays(data_main$datetime_occ)
data_main$hod_occ <- hour(data_main$datetime_occ)

data_main$week <- lubridate::week(data_main$datetime_occ)
data_main$year <- lubridate::year(data_main$datetime_occ)

Days between Date Rptd and datetime_occ

data_main$days_diffrence <- as.numeric(data_main$'Date Rptd' - as.Date(data_main$datetime_occ))

Basic visual analysis

Visualising volume of crimes by LA area.

barplot(sort(table(data_main$'AREA NAME'), TRUE), 
        main = "Volume of crimes by LA area ",
        ylab = "Count",
        col = "skyblue",        
        border = "black",       
        las = 2,                
        cex.names = 0.8,        
        cex.lab = 1,          
        cex.main = 1.4,         
        beside = TRUE           
)


Top 25 crime types and locations in LA.

for (val in list(list("crime type", "Crm Cd Desc"), list('location', "Premis Desc"))){
  par(mar = c(5, 15, 1, 1))
  options(repr.plot.height=100) 
  
  barplot(tail(sort(table(data_main[[val[[2]]]])),25), 
          main = paste("Volume of crimes by", val[[1]], "top 25"),
          horiz = TRUE,
          xlab = "Count",
          col = "skyblue",        
          border = "black",       
          las = 2,                
          cex.names = 0.4,        
          cex.lab = 0.5,          
          cex.main = 1,         
          beside = TRUE,
  )
}


Volume of crimes by victim descent.

par(mar = c(5, 10, 1, 1))
barplot(sort(table(subset(data_main, 'Vict Descent' > 0)$'Vict Descent')), 
        main = "Volume of crimes by victim descent",
        horiz = TRUE,
        xlab = "Count",
        col = "skyblue",        
        border = "black",       
        las = 2,                
        cex.names = 0.9,        
        cex.lab = 0.5,          
        cex.main = 1,         
        beside = TRUE,
)


Volume of crimes by victim descent and victim sex.

df_vict <- na.omit(data_main[, c("Vict Descent", "Vict Sex", "Vict Age")])
df_vict$"Vict Descent" <- as.character(df_vict$"Vict Descent")
names(df_vict) <- c("vict_descent", "vict_sex", "vict_age")
df_vict <- df_vict %>%filter(vict_sex != "Unknown")


ggplot(df_vict, aes(x =  factor(vict_descent, levels = names(sort(table(vict_descent), decreasing = TRUE))), fill = vict_sex)) +
  geom_bar(position = "dodge", color = "black", alpha = 0.8) +
  labs(title = "Volume of crimes with victim descent breakdown by sex", x = "Victim descent", y = "Count") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.title = element_blank(),
    legend.position = "top"
) 

ggplot(df_vict, aes(x = vict_age, group = vict_sex, color = vict_sex)) +
  geom_line(stat = "count") +
  labs(title = "Count of victims by age and sex", x = "Age", y = "Count") +
  theme_minimal()

ggplot(df_vict, aes(x = vict_sex, y = vict_age, fill = vict_sex)) +
  geom_boxplot() +
  labs(title = "Victim age by sex", x = "Sex", y = "Age") +
  theme_minimal()


Crimes volume in time - hour of day and day of weeks.

dow_hod_df <- data_main %>%
  group_by(dow_occ, hod_occ) %>%
  summarise(Count = n())%>%
  filter(!is.na(dow_occ))

dow_hod_df$dow_occ <- factor(dow_hod_df$dow_occ, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"), ordered = TRUE)

ggplot(dow_hod_df, aes(x = dow_occ, y = hod_occ, fill = Count)) +
  geom_tile() +
  scale_fill_gradient(low = "yellow", high = "red") + 
  labs(title = "Heatmap of crimes frequency by hour of day and day of week", x = "day of week", y = "hour of day", fill = "frequency") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) 


Analysis of number of days between date reported and day of crime occurrence.

density_days_diffrence <- density(na.omit(as.numeric(data_main$days_diffrence)))
ggplot(data.frame(x = density_days_diffrence$x, y = density_days_diffrence$y), aes(x = x, y = y)) +
  geom_line() +
  labs(title = "Density Plot of number of days between date reported and day of crime occurrence", x = "days between", y = "Density")


Crime time series visualization with trend.

weekly_counts <- data_main %>%
  group_by(year, week) %>%
  summarise(count = n())%>%
  filter(!(year==2024&week>4))

ggplot(weekly_counts, aes(x = as.Date(paste(year, week, "1", sep = "-"), format = "%Y-%U-%u"), y = count)) +
  geom_line() +
  geom_smooth(method = "lm", se = FALSE, color = "red", size = 0.3) +
  labs(title = "Weekly crimes count with trend", x = "Date", y = "Number of Records") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b %Y") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) 


Weapon analysis - top 20 weapons.

#showing action taken 
par(mar = c(5, 20, 1, 1))
barplot(tail(sort(table(na.omit(data_main$'Weapon Desc')), FALSE),20), 
        main = "Crimes with weapon by type",
        xlab = "Count",
        horiz = TRUE,
        col = "skyblue",        
        border = "black",       
        las = 1,                
        cex.names = 0.6,        
        cex.lab = 1,          
        cex.main = 1.4,         
        beside = TRUE           
)


Volume of crimes by crime status

par(mar = c(5, 6, 1, 1))
barplot(sort(table(data_main$'Status Desc')), 
        main = "Volume of crimes by action taken",
        horiz = TRUE,
        xlab = "Count",
        col = "skyblue",        
        border = "black",       
        las = 2,                
        cex.names = 1,        
        cex.lab = 0.5,          
        cex.main = 1,         
        beside = TRUE,
)

Descriptive analysis

Summary for continuous variables.

descriptive_cont <- data_main[, names(data_main) %in% c("Vict Age", "days_diffrence")]
summary(descriptive_cont)
##     Vict Age      days_diffrence   
##  Min.   :  2.00   Min.   :   0.00  
##  1st Qu.: 28.00   1st Qu.:   0.00  
##  Median : 37.00   Median :   1.00  
##  Mean   : 39.58   Mean   :  11.04  
##  3rd Qu.: 50.00   3rd Qu.:   2.00  
##  Max.   :120.00   Max.   :1483.00  
##  NA's   :224140   NA's   :64

Dispersion measurements for continuous variables.

descriptive_cont <- na.omit(descriptive_cont)
summarise(descriptive_cont,
          variance_vict_age = var(`Vict Age`),
          sd_vict_age = sd(`Vict Age`),
          cv_vict_age = sd(`Vict Age`) / mean(`Vict Age`) * 100,
          range_vict_age = diff(range(`Vict Age`)),
          iqr_vict_age = IQR(`Vict Age`),
          
          variance_days_diffrence = var(`days_diffrence`),
          sd_days_diffrence = sd(`days_diffrence`),
          cv_days_diffrence = sd(`days_diffrence`) / mean(`days_diffrence`) * 100,
          range_days_diffrence = diff(range(`days_diffrence`)),
          iqr_days_diffrence = IQR(`days_diffrence`)
)

Concentration measurements for continuous variables.

summarise(descriptive_cont,
          skewness_vict_age = skewness(`Vict Age`),
          kurtosis_vict_age = kurtosis(`Vict Age`),
          skewness_days_diffrence = skewness(`days_diffrence`),
          kurtosis_days_diffrence = kurtosis(`days_diffrence`)
)

Percentage analysis for selected categorical variables.

calculate_percentage_stats <- function(var) {
  if (length(unique(data_main[[var]])) > 20) {
    data_subset <- subset(data_main, data_main[[var]] %in% 
                            names(sort(table(data_main[[var]]), decreasing = TRUE))[1:20])
    return(sort(table(data_subset[[var]]) / nrow(data_main) * 100, TRUE))
  } 
  else {
    return(sort(table(data_main[[var]]) / nrow(data_main) * 100, TRUE))
  }
}


for (var in c("AREA NAME", "Crm Cd Desc", "Vict Sex", "Vict Descent", "Status Desc", "Weapon Desc", "dow_occ", "Premis Desc")) {
  cat(paste("Percentage statistics for", var, ":\n"))
  print(calculate_percentage_stats(var))
  cat("\n")
}
## Percentage statistics for AREA NAME :
## 
##     Central 77th Street     Pacific   Southwest   Hollywood   Southeast 
##    6.811366    6.263397    5.824619    5.613965    5.267355    5.053789 
##     Olympic N Hollywood      Newton    Wilshire     Rampart     West LA 
##    5.050989    5.005185    4.995554    4.787588    4.723977    4.578726 
##   Northeast    Van Nuys West Valley      Harbor     Topanga  Devonshire 
##    4.303566    4.222149    4.202326    4.117438    4.084512    4.074097 
##     Mission  Hollenbeck 
##    3.980249    3.731631 
## 
## Percentage statistics for Crm Cd Desc :
## 
##                                         VEHICLE - STOLEN 
##                                                10.736068 
##                                 BATTERY - SIMPLE ASSAULT 
##                                                 7.953443 
##                                        THEFT OF IDENTITY 
##                                                 6.198778 
##                                    BURGLARY FROM VEHICLE 
##                                                 6.172461 
##                                                 BURGLARY 
##                                                 6.121057 
##  VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS) 
##                                                 6.063046 
##           ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT 
##                                                 5.700757 
##                       THEFT PLAIN - PETTY ($950 & UNDER) 
##                                                 5.101833 
##                        INTIMATE PARTNER - SIMPLE ASSAULT 
##                                                 4.987379 
##          THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER) 
##                                                 3.842165 
##      THEFT FROM MOTOR VEHICLE - GRAND ($950.01 AND OVER) 
##                                                 3.578092 
##                                                  ROBBERY 
##                                                 3.421641 
## THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LIVESTK,PROD 
##                                                 3.313235 
##                 VANDALISM - MISDEAMEANOR ($399 OR UNDER) 
##                                                 2.612399 
##                 SHOPLIFTING - PETTY THEFT ($950 & UNDER) 
##                                                 2.526950 
##                   CRIMINAL THREATS - NO WEAPON DISPLAYED 
##                                                 2.050543 
##                                          BRANDISH WEAPON 
##                                                 1.551962 
##                                              TRESPASSING 
##                                                 1.510414 
##                    INTIMATE PARTNER - AGGRAVATED ASSAULT 
##                                                 1.353963 
##                           VIOLATION OF RESTRAINING ORDER 
##                                                 1.257428 
## 
## Percentage statistics for Vict Sex :
## 
##        Male      Female     Unknown       Other 
## 41.07078463 36.60852874  9.06114002  0.01086306 
## 
## Percentage statistics for Vict Descent :
## 
## Hispanic/Latin/Mexican                  White                  Black 
##           30.488031590           20.252896631           14.101938105 
##                Unknown                  Other            Other Asian 
##            9.978565045            7.902599744            2.195123044 
##                 Korean               Filipino                Chinese 
##            0.543489216            0.427019242            0.395661941 
##               Japanese             Vietnamese                 Native 
##            0.140211931            0.105830890            0.094967825 
##           Asian Indian       Pacific Islander               Hawaiian 
##            0.050171681            0.026653706            0.019934284 
##              Cambodian              Guamanian                Laotian 
##            0.008063306            0.007167383            0.006607431 
##                 Samoan 
##            0.005375537 
## 
## Percentage statistics for Status Desc :
## 
##  Invest Cont  Adult Other Adult Arrest   Juv Arrest    Juv Other          UNK 
## 8.006504e+01 1.076474e+01 8.669062e+00 3.241001e-01 1.766088e-01 4.479614e-04 
## 
## Percentage statistics for Weapon Desc :
## 
## STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE) 
##                                     18.5725933 
##                    UNKNOWN WEAPON/OTHER WEAPON 
##                                      3.7358864 
##                                  VERBAL THREAT 
##                                      2.5381495 
##                                       HAND GUN 
##                                      2.1459593 
##                          SEMI-AUTOMATIC PISTOL 
##                                      0.7772131 
##               KNIFE WITH BLADE 6INCHES OR LESS 
##                                      0.7321930 
##                                UNKNOWN FIREARM 
##                                      0.7000517 
##                                    OTHER KNIFE 
##                                      0.6261381 
##                              MACE/PEPPER SPRAY 
##                                      0.3979017 
##                                        VEHICLE 
##                                      0.3459382 
##                             ROCK/THROWN OBJECT 
##                                      0.2917349 
##                                PIPE/METAL PIPE 
##                                      0.2636253 
##                                         BOTTLE 
##                                      0.2569059 
##                                  FOLDING KNIFE 
##                                      0.2408913 
##                                          STICK 
##                                      0.2403313 
##                                       CLUB/BAT 
##                                      0.2230848 
##                                  KITCHEN KNIFE 
##                                      0.2034865 
##               AIR PISTOL/REVOLVER/RIFLE/BB GUN 
##                                      0.1981109 
##       KNIFE WITH BLADE OVER 6 INCHES IN LENGTH 
##                                      0.1800805 
##                               BLUNT INSTRUMENT 
##                                      0.1500671 
## 
## Percentage statistics for dow_occ :
## 
##    Friday  Saturday Wednesday    Monday  Thursday    Sunday   Tuesday 
##  15.26563  14.65853  14.20755  14.12142  14.08660  13.92197  13.73114 
## 
## Percentage statistics for Premis Desc :
## 
##                                       STREET 
##                                   25.3506978 
##                       SINGLE FAMILY DWELLING 
##                                   16.8786271 
## MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC) 
##                                   12.2250917 
##                                  PARKING LOT 
##                                    6.9354510 
##                               OTHER BUSINESS 
##                                    4.7408879 
##                                     SIDEWALK 
##                                    4.2815034 
##                     VEHICLE, PASSENGER/TRUCK 
##                                    2.9356033 
##                               GARAGE/CARPORT 
##                                    1.9344095 
##                                     DRIVEWAY 
##                                    1.6062777 
##                         RESTAURANT/FAST FOOD 
##                                    1.2756822 
##                             DEPARTMENT STORE 
##                                    1.2588836 
##                 PARKING UNDERGROUND/BUILDING 
##                                    0.8483270 
##                                OTHER PREMISE 
##                                    0.7914359 
##                                       MARKET 
##                                    0.7614225 
##                                        ALLEY 
##                                    0.7134906 
##                              OTHER RESIDENCE 
##                                    0.6897486 
##                              PARK/PLAYGROUND 
##                                    0.6609671 
##                  YARD (RESIDENTIAL/BUSINESS) 
##                                    0.6166189 
##                                  GAS STATION 
##                                    0.5739506 
##                                        HOTEL 
##                                    0.5698070

TL;DR for visual and descriptive analysis
- The most frequent LA crimes occur in the central and 77th street areas, comprising 6.8% and 6.3% of the total, respectively.
- Most crimes occur on the street, accounting for over 25% of all reported incidents.
- The most prevalent type of crime is vehicle theft, representing 10.8% of all reported crimes.
- Victims are predominantly of Hispanic, Latin, or Mexican descent, constituting over 40% of the total. The distribution of sexes is nearly uniform, with the mean age around 40 years old.
- Crimes predominantly occur during nighttime, with a peak on Friday nights, although overall distribution across days of the week is similar.
- There is a visible increasing trend in crime volume since 2020.
- The majority of crimes are reported promptly after they occur.

Relational analysis

Correlogram for a few variables.

data_main <- data_main %>% 
  mutate(is_investigation_cont = ifelse(data_main$"Status Desc" == "Invest Cont", 1, 0),
         is_street = ifelse(data_main$"Premis Desc" == "STREET", 1, 0), 
         is_vehicle_stolen = ifelse(data_main$"Crm Cd Desc" == "VEHICLE - STOLEN", 1, 0),
         is_central = ifelse(data_main$"AREA NAME" == "Central", 1, 0),
         is_77_street = ifelse(data_main$"AREA NAME" == "77th Street", 1, 0), 
         is_male = ifelse(data_main$"Vict Sex" == "Male", 1, 0), 
         is_weapon = ifelse(is.na(data_main$"Weapon Desc"), 0, 1), 
         is_theft_of_identity = ifelse(data_main$"Crm Cd Desc" == "THEFT OF IDENTITY", 1, 0))

corr_data <- data_main[, names(data_main) %in% c("Vict Age", "days_diffrence", "is_investigation_cont", 
                                                "is_street", "is_vehicle_stolen", "is_weapon",
                                                "is_central", "is_77_street", "is_male", "is_theft_of_identity")]

corr_data <- na.omit(corr_data[sample(nrow(corr_data), 10000), ])
correlation_matrix <- cor(corr_data, use = "complete.obs")
correlation_matrix[is.na(correlation_matrix)] = 0

ggplotly(ggcorrplot(
  correlation_matrix, hc.order = TRUE, type = "lower",
  outline.col = "white"
))

Geospatial analysis

Showing example 100 crimes with description

m <- leaflet() %>%
  setView(lng = -118.2437, lat = 34.0522, zoom = 10) %>%
  addTiles()
for (i in 1:nrow(tail(data_main, 100))) {
  m <- addMarkers(m, lng = data_main$LON[i], lat = data_main$LAT[i], popup = data_main$'Crm Cd Desc'[i])
}
m

Showing heatmap of crimes locations.

data_main <- subset(data_main, (data_main$LON != 0) & (data_main$LAT != 0))
kde <- bkde2D(cbind(data_main$LON, data_main$LAT),
              bandwidth=c(.0045, .0068), gridsize = c(1e3,1e3))
KernelDensityRaster <- raster(list(x=kde$x1 ,y=kde$x2 ,z = kde$fhat))
KernelDensityRaster@data@values[which(KernelDensityRaster@data@values < 1)] <- NA
 
palRaster <- colorNumeric("YlOrRd", domain = KernelDensityRaster@data@values, na.color = "transparent")

leaflet() %>%
  setView(lng = -118.2437, lat = 34.0522, zoom = 10) %>% 
  addTiles() %>% 
  addRasterImage(KernelDensityRaster, 
                 colors = palRaster, 
                 opacity = .8) %>%
  addLegend(pal = palRaster, 
            values = KernelDensityRaster@data@values, 
            title = "Density of Crimes")

Showing example 1000 crimes with weapon

df_weapon <- na.omit(data_main[, c("Weapon Desc", "LAT", "LON")])
m <- leaflet() %>%
  setView(lng = -118.2437, lat = 34.0522, zoom = 10) %>%
  addTiles()
for (i in 1:nrow(df_weapon %>% sample_n(1000))) {
  m <- addMarkers(m, lng = df_weapon$LON[i], lat = df_weapon$LAT[i], popup = df_weapon$'Weapon Desc'[i])
}
m